{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "# Toy corpus: Chinese sentences that are already word-segmented,\n",
    "# with the tokens separated by spaces.\n",
    "corpus = [\n",
    "    '语言 是 人类 区别 其他 动物 的 本质 特性。',\n",
    "    '在 所有 生物 中，只有 人类 才 具有 语言 能力。',\n",
    "    '人类 的 多种 智能 都 与 语言 有着 密切 的 关系。',\n",
    "    '人类 的 逻辑 思维 以 语言 为 形式 ， 人类 的 绝大 部分 知识 也是 以 语言 文字 的 形式 记载 和 流传 下来 的。',\n",
    "    '因而，它 也是 人工智能 的 一个 重要 ， 甚至 核心 部分。',\n",
    "]\n",
    "\n",
    "# Bag-of-words counter with scikit-learn's default settings.\n",
    "# NOTE: the default token pattern only keeps tokens of two or more word\n",
    "# characters, so single-character words in the corpus are not counted.\n",
    "vectorizer = CountVectorizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,\n",
       "        0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],\n",
       "       [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0],\n",
       "       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],\n",
       "       [0, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 0, 0, 0, 0,\n",
       "        1, 0, 0, 0, 1, 1, 0, 1, 2, 1, 1, 0],\n",
       "       [1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n",
       "        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]], dtype=int64)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Learn the vocabulary from the corpus and count each token per sentence.\n",
    "X = vectorizer.fit_transform(corpus)\n",
    "\n",
    "# Display the counts as a dense array: one row per sentence,\n",
    "# one column per vocabulary index.\n",
    "X.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'语言': 30,\n",
       " '人类': 4,\n",
       " '区别': 9,\n",
       " '其他': 6,\n",
       " '动物': 8,\n",
       " '本质': 20,\n",
       " '特性': 23,\n",
       " '所有': 16,\n",
       " '生物': 25,\n",
       " '只有': 10,\n",
       " '具有': 7,\n",
       " '能力': 28,\n",
       " '多种': 12,\n",
       " '智能': 18,\n",
       " '有着': 19,\n",
       " '密切': 13,\n",
       " '关系': 5,\n",
       " '逻辑': 31,\n",
       " '思维': 15,\n",
       " '形式': 14,\n",
       " '绝大': 27,\n",
       " '部分': 32,\n",
       " '知识': 26,\n",
       " '也是': 2,\n",
       " '文字': 17,\n",
       " '记载': 29,\n",
       " '流传': 22,\n",
       " '下来': 1,\n",
       " '因而': 11,\n",
       " '人工智能': 3,\n",
       " '一个': 0,\n",
       " '重要': 33,\n",
       " '甚至': 24,\n",
       " '核心': 21}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mapping from each learned token to its column index in the count matrix.\n",
    "vectorizer.vocabulary_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "zh-cn\n",
      "en\n",
      "[en:0.9999962075788779]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'本篇博客主要介绍两款语言探测工具，用于区分文本到底是什么语言，We are pleased to introduce today a new technology *&*$#$&@##@@@#!@@#!@#'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Detect which language a piece of text is written in.\n",
    "\n",
    "from langdetect import DetectorFactory\n",
    "from langdetect import detect\n",
    "from langdetect import detect_langs\n",
    "\n",
    "# langdetect's detection algorithm is non-deterministic; fixing the seed\n",
    "# before the first detection makes the results reproducible across runs.\n",
    "DetectorFactory.seed = 0\n",
    "\n",
    "s1 = \"本篇博客主要介绍两款语言探测工具，用于区分文本到底是什么语言，\"\n",
    "s2 = 'We are pleased to introduce today a new technology *&*$#$&@##@@@#!@@#!@#'\n",
    "s3 = s1+s2\n",
    "print(detect(s1))\n",
    "print(detect(s2))\n",
    "print(detect_langs(s3))    # detect_langs() lists every detected language with its probability"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
